{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "findspark.init()\n",
    "\n",
    "from pyspark import SparkContext\n",
    "\n",
    "spark = SparkSession\\\n",
    "    .builder\\\n",
    "    .appName(\"spark mysql\")\\\n",
    "    .getOrCreate()\n",
    "\n",
    "from pyspark.mllib.fpm import FPGrowth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FreqItemset(items=['a'], freq=4)\n",
      "FreqItemset(items=['c'], freq=3)\n",
      "FreqItemset(items=['c', 'a'], freq=3)\n",
      "FreqItemset(items=['e'], freq=2)\n",
      "FreqItemset(items=['e', 'c'], freq=1)\n",
      "FreqItemset(items=['e', 'c', 'a'], freq=1)\n",
      "FreqItemset(items=['e', 'a'], freq=2)\n",
      "FreqItemset(items=['b'], freq=2)\n",
      "FreqItemset(items=['b', 'e'], freq=1)\n",
      "FreqItemset(items=['b', 'e', 'a'], freq=1)\n",
      "FreqItemset(items=['b', 'c'], freq=1)\n",
      "FreqItemset(items=['b', 'c', 'a'], freq=1)\n",
      "FreqItemset(items=['b', 'a'], freq=2)\n",
      "FreqItemset(items=['d'], freq=1)\n",
      "FreqItemset(items=['d', 'e'], freq=1)\n",
      "FreqItemset(items=['d', 'e', 'a'], freq=1)\n",
      "FreqItemset(items=['d', 'b'], freq=1)\n",
      "FreqItemset(items=['d', 'b', 'e'], freq=1)\n",
      "FreqItemset(items=['d', 'b', 'e', 'a'], freq=1)\n",
      "FreqItemset(items=['d', 'b', 'a'], freq=1)\n",
      "FreqItemset(items=['d', 'a'], freq=1)\n",
      "FreqItemset(items=['f'], freq=1)\n",
      "FreqItemset(items=['f', 'c'], freq=1)\n",
      "FreqItemset(items=['f', 'c', 'a'], freq=1)\n",
      "FreqItemset(items=['f', 'a'], freq=1)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "data = [[\"a\", \"b\", \"c\"], [\"a\", \"b\", \"d\", \"e\"], [\"a\", \"c\", \"e\"], [\"a\", \"c\", \"f\"]]\n",
    "rdd = sc.parallelize(data, 2)\n",
    "\n",
    "model = FPGrowth.train(rdd, minSupport=0.2, numPartitions=10)\n",
    "result = model.freqItemsets().collect()\n",
    "for fi in result:\n",
    "    print(fi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SQLContext\n",
    "\n",
    "sqlContext = SQLContext(sc)\n",
    "\n",
    "#ignore this , it is just for testing\n",
    "toy_products_df = sqlContext.read.csv(\"./amazon_co-ecommerce_sample.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------+\n",
      "|                 _c0|                 _c1|                 _c2|                 _c3|                 _c4|              _c5|                 _c6|                 _c7|                 _c8|                 _c9|                _c10|                _c11|                _c12|                _c13|                _c14|            _c15|   _c16|\n",
      "+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------+\n",
      "|             uniq_id|        product_name|        manufacturer|               price|number_available_...|number_of_reviews|number_of_answere...|average_review_ra...|amazon_category_a...|customers_who_bou...|         description| product_information| product_description|items_customers_b...|customer_question...|customer_reviews|sellers|\n",
      "|eac7efa5dbd3d667f...|Hornby 2014 Catal...|              Hornby|               £3.42|               5 new|               15|                   1|  4.9 out of 5 stars|Hobbies > Model T...|http://www.amazon...|Product Descripti...|Technical Details...|Product Descripti...|http://www.amazon...|Does this catalog...|            null|   null|\n",
      "|            see more|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|    HiThe 2014 ca...|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|           see less\"|Worth Buying For ...|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|         Copnovelist|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 6 April 2014 ...| even if it inclu...|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|             richard|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 11 April 2015...| every credit to ...| a worthy referen...| as well as a sal...|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|         Pinkhandbag|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 23 April 2014...| so this has been...|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|    Gary John Mapson|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 11 Jun. 2014 ...|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|         David Baker|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 7 Dec. 2014 /...|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|         John A. Day|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 20 Mar. 2015 ...|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|           T. Davies|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "| on 7 Oct. 2014 /...|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "|         John Butlin|                null|                null|                null|                null|             null|                null|                null|                null|                null|                null|                null|                null|                null|                null|            null|   null|\n",
      "+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "toy_products_df.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reading amazon's toy products\n",
    "# it is just for testing\n",
    "product_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('./amazon_co-ecommerce_sample.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|        product_name|\n",
      "+--------------------+\n",
      "|Hornby 2014 Catal...|\n",
      "|                null|\n",
      "|                null|\n",
      "|Worth Buying For ...|\n",
      "|                null|\n",
      "| even if it inclu...|\n",
      "|                null|\n",
      "| every credit to ...|\n",
      "|                null|\n",
      "| so this has been...|\n",
      "+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "product_df.select('product_name').show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getting product names and removing null values\n",
    "# it is just for testing\n",
    "product_name = product_df.where(product_df.product_name.isNotNull()).select('product_name')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|        product_name|\n",
      "+--------------------+\n",
      "|Hornby 2014 Catal...|\n",
      "|Worth Buying For ...|\n",
      "| even if it inclu...|\n",
      "| every credit to ...|\n",
      "| so this has been...|\n",
      "|\"{\"\"seller\"\"=>[{\"...|\n",
      "|FunkyBuys® Large ...|\n",
      "|\"{\"\"seller\"\"=>{\"\"...|\n",
      "|CLASSIC TOY TRAIN...|\n",
      "| it literally did...|\n",
      "+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "product_name.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Frequent pattern mining for user clicked items/transection items\n",
    "product_data_rdd = sc.textFile('./fp_product_data.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['82475 -1 84211 -1 86919 -1 86927 -1 86943 -1 -2',\n",
       " '56109 -1 222699 -1 -2',\n",
       " '55455 -1 -2',\n",
       " '81795 -1 81991 -1 -2',\n",
       " '55403 -1 55407 -1 55411 -1 55435 -1 55831 -1 55835 -1 55839 -1 55847 -1 55863 -1 55879 -1 55895 -1 55991 -1 56077 -1 56225 -1 56373 -1 56765 -1 222499 -1 222511 -1 222607 -1 222699 -1 -2']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "product_data_rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['82475',\n",
       "  '-1',\n",
       "  '84211',\n",
       "  '-1',\n",
       "  '86919',\n",
       "  '-1',\n",
       "  '86927',\n",
       "  '-1',\n",
       "  '86943',\n",
       "  '-1',\n",
       "  '-2'],\n",
       " ['56109', '-1', '222699', '-1', '-2'],\n",
       " ['55455', '-1', '-2'],\n",
       " ['81795', '-1', '81991', '-1', '-2'],\n",
       " ['55403',\n",
       "  '-1',\n",
       "  '55407',\n",
       "  '-1',\n",
       "  '55411',\n",
       "  '-1',\n",
       "  '55435',\n",
       "  '-1',\n",
       "  '55831',\n",
       "  '-1',\n",
       "  '55835',\n",
       "  '-1',\n",
       "  '55839',\n",
       "  '-1',\n",
       "  '55847',\n",
       "  '-1',\n",
       "  '55863',\n",
       "  '-1',\n",
       "  '55879',\n",
       "  '-1',\n",
       "  '55895',\n",
       "  '-1',\n",
       "  '55991',\n",
       "  '-1',\n",
       "  '56077',\n",
       "  '-1',\n",
       "  '56225',\n",
       "  '-1',\n",
       "  '56373',\n",
       "  '-1',\n",
       "  '56765',\n",
       "  '-1',\n",
       "  '222499',\n",
       "  '-1',\n",
       "  '222511',\n",
       "  '-1',\n",
       "  '222607',\n",
       "  '-1',\n",
       "  '222699',\n",
       "  '-1',\n",
       "  '-2']]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "product_ready_rdd = product_data_rdd.map(lambda x: x.split(\",\")).map(lambda x: x[0].split(\" \"))\n",
    "product_ready_rdd.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "77512"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "product_ready_rdd.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.mllib.fpm import FPGrowth\n",
    "import numpy as np\n",
    "\n",
    "# function for generating min Support\n",
    "def find_minSupport(x, a=0.4, b=0.2, c=0.2):\n",
    "    return np.finfo(np.exp([a * x + b])) + c\n",
    "\n",
    "# training the model\n",
    "unique = product_ready_rdd.map(lambda x: list(set(x))).cache()\n",
    "fp_model = FPGrowth.train(unique, minSupport=0.2, numPartitions=2)\n",
    "# getting frequent items sets/FP tree\n",
    "result = fp_model.freqItemsets().collect()\n",
    "\n",
    "# display top 10 sets of frequent items\n",
    "i = 0\n",
    "for fi in result:\n",
    "    if i <= 10:\n",
    "        print(fi)\n",
    "        i += 1\n",
    "    else:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FreqItemset(items=['-1'], freq=77512)\n",
      "FreqItemset(items=['-2'], freq=77512)\n",
      "FreqItemset(items=['-2', '-1'], freq=77512)\n"
     ]
    }
   ],
   "source": [
    "for fi in result:\n",
    "        print(fi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+----+\n",
      "|    items|freq|\n",
      "+---------+----+\n",
      "|      [1]|   3|\n",
      "|      [2]|   3|\n",
      "|   [2, 1]|   3|\n",
      "|      [5]|   2|\n",
      "|   [5, 2]|   2|\n",
      "|[5, 2, 1]|   2|\n",
      "|   [5, 1]|   2|\n",
      "+---------+----+\n",
      "\n",
      "+----------+----------+------------------+\n",
      "|antecedent|consequent|        confidence|\n",
      "+----------+----------+------------------+\n",
      "|    [5, 2]|       [1]|               1.0|\n",
      "|       [2]|       [1]|               1.0|\n",
      "|       [2]|       [5]|0.6666666666666666|\n",
      "|    [2, 1]|       [5]|0.6666666666666666|\n",
      "|       [5]|       [2]|               1.0|\n",
      "|       [5]|       [1]|               1.0|\n",
      "|    [5, 1]|       [2]|               1.0|\n",
      "|       [1]|       [2]|               1.0|\n",
      "|       [1]|       [5]|0.6666666666666666|\n",
      "+----------+----------+------------------+\n",
      "\n",
      "+---+------------+----------+\n",
      "| id|       items|prediction|\n",
      "+---+------------+----------+\n",
      "|  0|   [1, 2, 5]|        []|\n",
      "|  1|[1, 2, 3, 5]|        []|\n",
      "|  2|      [1, 2]|       [5]|\n",
      "+---+------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.fpm import FPGrowth\n",
    "\n",
    "df = spark.createDataFrame([\n",
    "    (0, [1, 2, 5]),\n",
    "    (1, [1, 2, 3, 5]),\n",
    "    (2, [1, 2])\n",
    "], [\"id\", \"items\"])\n",
    "\n",
    "fpGrowth = FPGrowth(itemsCol=\"items\", minSupport=0.5, minConfidence=0.6)\n",
    "model = fpGrowth.fit(df)\n",
    "\n",
    "# Display frequent itemsets.\n",
    "model.freqItemsets.show()\n",
    "\n",
    "# Display generated association rules.\n",
    "model.associationRules.show()\n",
    "\n",
    "# transform examines the input items against all the association rules and summarize the\n",
    "# consequents as prediction\n",
    "model.transform(df).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
